library(caret); library(rattle); library(rpart); library(rpart.plot)
library(randomForest); library(corrplot)
# Load the locally downloaded data.
# The raw CSVs encode missing values three different ways; map them all
# to NA on read. Path and NA tokens are factored out so the two reads
# cannot drift apart.
na_tokens <- c("NA", "#DIV/0!", "")
data_dir <- "C:/Coursera/08_Practical_Machine_learning"
trainRead <- read.csv(file.path(data_dir, "pml-training.csv"), na.strings = na_tokens)
testRead <- read.csv(file.path(data_dir, "pml-testing.csv"), na.strings = na_tokens)
dim(trainRead); dim(testRead)
## [1] 19622 160
## [1] 20 160
# Since the columns will serve as predictors, drop every column that
# contains any missing value (each is cleaned against its own data set,
# because train carries classe while test carries problem_id instead).
keep_train <- colSums(is.na(trainRead)) == 0
keep_test <- colSums(is.na(testRead)) == 0
trainClean <- trainRead[, keep_train]
testClean <- testRead[, keep_test]
# That reduces the data to only 60 columns.
dim(trainClean); dim(testClean)
## [1] 19622 60
## [1] 20 60
# The first seven columns hold a sequential row number and variations of
# the timestamp -- bookkeeping fields, not measurements -- and are not
# used in this analysis, so drop them, leaving 53 columns.
bookkeeping <- 1:7
trainOK <- trainClean[, -bookkeeping]
testOK <- testClean[, -bookkeeping]
dim(trainOK); dim(testOK)
## [1] 19622 53
## [1] 20 53
# With the data set settled, check the correlation among the variables
# used. cor() accepts only numeric columns, so non-numeric columns
# (the classe response) are filtered out first.
exerCorrmatrix <- cor(trainOK[sapply(trainOK, is.numeric)])
tiff(file = "corrnew.tif", res = 96, width = 1000, height = 1000)
corrplot(exerCorrmatrix, order = "FPC", method = "circle",
         tl.cex = 0.45, tl.col = "black", number.cex = 0.25)
title("Correlation Matrix of the variables used", line = 1)
# Close the tiff device: without dev.off() the file is never finalised
# on disk and subsequent plots would keep going to it.
dev.off()
### Create the datasets
# Seed BEFORE the random partition so the train/validation split is
# reproducible (the original seeded after createDataPartition, so the
# split changed on every run; the echoed output below came from one
# such run).
set.seed(2018)
# Since R 4.0 read.csv no longer returns factors; caret's classifiers
# expect a factor response, so coerce explicitly.
trainOK$classe <- factor(trainOK$classe)
inTrain <- createDataPartition(trainOK$classe, p = 3/4, list = FALSE)
train <- trainOK[inTrain, ]
valid <- trainOK[-inTrain, ]
# Exploratory PCA: how many components are needed for 80% of the
# variance? (classe is non-numeric, hence "ignored (1)" below.)
PropPCA <- preProcess(train, method = "pca", thresh = 0.8)
PropPCA
## Created from 14718 samples and 53 variables
##
## Pre-processing:
## - centered (52)
## - ignored (1)
## - principal component signal extraction (52)
## - scaled (52)
##
## PCA needed 12 components to capture 80 percent of the variance
# Create the preProc object, excluding the response (classe, column 53).
# pcaComp fixes the number of components directly, and caret documents
# that pcaComp over-rides thresh when both are given -- so the redundant
# thresh argument is dropped for clarity (behavior is unchanged).
preProc <- preProcess(train[, -53], method = "pca", pcaComp = 12)
# Apply the same transformation to the train and validation data, then
# re-attach the response to each data frame.
train_pca <- predict(preProc, train[, -53])
train_pca$classe <- train$classe
# train_pca now holds only the 12 principal components plus classe
valid_pca <- predict(preProc, valid[, -53])
valid_pca$classe <- valid$classe
# valid_pca now holds only the 12 principal components plus classe
###**Choose algorithms to predict**
#####Two methods were tested: gbm = Generalized Boosted Regression and rf = Random Forest.
### GBM produced the worse result and, since it takes a long time to retrain, it is dropped (left commented out below).
#fit_gbm<-train(classe ~., data=train_pca, method="gbm")
#print(fit_gbm, digits=4)
#predict_gbm<-predict(fit_gbm,valid_pca)
#(conf_gbm<-confusionMatrix(valid_pca$classe, predict_gbm))
#(accuracy_gbm<-conf_gbm$overall['Accuracy'])
### Random forest, resampled with 5-fold cross-validation.
cv_control <- trainControl(method = "cv", number = 5, allowParallel = TRUE)
fit_rf <- train(classe ~ ., data = train_pca, method = "rf",
                trControl = cv_control)
print(fit_rf, digits = 4)
## Random Forest
##
## 14718 samples
## 12 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 11774, 11774, 11775, 11775, 11774
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9540 0.9418
## 7 0.9484 0.9347
## 12 0.9401 0.9243
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Evaluate the random forest on the held-out validation set.
predict_rf <- predict(fit_rf, valid_pca)
# caret's signature is confusionMatrix(data, reference): predictions
# first, truth second. The original call had them swapped, which
# transposes the confusion matrix and exchanges sensitivity with PPV
# (and specificity with NPV); the echoed output below reflects the
# swapped call. Overall accuracy and kappa are unaffected.
(conf_rf <- confusionMatrix(data = predict_rf, reference = valid_pca$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1365 8 11 9 2
## B 15 903 26 5 0
## C 13 12 814 15 1
## D 11 1 22 766 4
## E 0 7 7 5 882
##
## Overall Statistics
##
## Accuracy : 0.9645
## 95% CI : (0.959, 0.9695)
## No Information Rate : 0.2863
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9551
## Mcnemar's Test P-Value : 0.004879
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9722 0.9699 0.9250 0.9575 0.9921
## Specificity 0.9914 0.9884 0.9898 0.9907 0.9953
## Pos Pred Value 0.9785 0.9515 0.9520 0.9527 0.9789
## Neg Pred Value 0.9889 0.9929 0.9837 0.9917 0.9983
## Prevalence 0.2863 0.1898 0.1794 0.1631 0.1813
## Detection Rate 0.2783 0.1841 0.1660 0.1562 0.1799
## Detection Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Balanced Accuracy 0.9818 0.9792 0.9574 0.9741 0.9937
(accuracy_rf <- conf_rf$overall['Accuracy'])
## Accuracy
## 0.9645188
# Project the 20 test cases through the same PCA transformation.
# Column 53 is dropped because it is not a predictor (in the test set
# it holds problem_id, read back on the next line).
test_pca <- predict(preProc, testOK[, -53])
test_pca$problem_id <- testOK$problem_id
final_pred <- predict(fit_rf, test_pca)
final_pred
## [1] B A A A A E D B A A B C B A E E A B B B
## Levels: A B C D E